I reproduced the ScRNA analysis for a Nat Comm paper: https://www.nature.com/articles/s41467-020-14296-y#Sec2
Wang, X., Xu, H., Cheng, C., Ji, Z., Zhao, H., Sheng, Y., … & Zhu, H. H. (2020). Identification of a Zeb1 expressing basal stem cell subpopulation in the prostate. Nature communications, 11(1), 1-16.
Here’s my modified and updated script to reproduce the works.
#setwd("R_HelenHeZhu_StemCell")
###############################################################################
### Step01 removal of poor quality cells and contaminated non-epithelial cells
###############################################################################
library(Seurat)
library(dplyr)
library(Matrix)
library(ggpubr)
#
data <- Read10X(data.dir = "GSE111429_RAW")
PG_all <- CreateSeuratObject(count = data, min.cells = 0, min.genes = 0, project = "PG")
PG_all <- PercentageFeatureSet(PG_all, pattern = "^mt-", col.name = "percent.mt")
before <- VlnPlot(PG_all, features =c('nFeature_RNA','nCount_RNA','percent.mt'))
# Poor-quality cells with less than 1000 genes detected, less than 5000 UMIs or more than 5% UMI mapped to mitochondria genes were removed.
PG_all <- subset(PG_all, subset = nFeature_RNA > 1400 & nCount_RNA > 5000 & percent.mt < 5)
after <- VlnPlot(PG_all, features =c('nFeature_RNA','nCount_RNA','percent.mt'))
ggarrange(before,after, ncol = 2, nrow = 1)
#
PG_all <- NormalizeData(PG_all, normalization.method='LogNormalize', scale.factor=1e4)
PG_all <- FindVariableFeatures(PG_all,
nfeatures = 1606,
mean.cutoff = c(0.0125,3),
dispersion.cutoff = c(0.5, Inf))
length(PG_all@assays$RNA@var.features)
## [1] 1606
summary(PG_all@assays$RNA@meta.features)
## vst.mean vst.variance vst.variance.expected
## Min. : 0.0000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.0047 Median : 0.01 Median : 0.01
## Mean : 0.4062 Mean : 9.40 Mean : 10.72
## 3rd Qu.: 0.1242 3rd Qu.: 0.16 3rd Qu.: 0.16
## Max. :488.0069 Max. :75515.80 Max. :135656.59
## vst.variance.standardized vst.variable
## Min. : 0.0000 Mode :logical
## 1st Qu.: 0.0000 FALSE:26392
## Median : 0.8621 TRUE :1606
## Mean : 0.7906
## 3rd Qu.: 0.9670
## Max. :33.7765
#
PG_all <- ScaleData(PG_all, vars.to.regress=c('nCount_RNA','percent.mt'))
PG_all <- RunPCA(PG_all, features = PG_all@assays$RNA@var.features, verbose = TRUE,
ndims.print=1:5, nfeatures.print = 5)
PCAPlot(PG_all, dims = c(1, 2))
#
DimHeatmap(PG_all, dims = 1:12, cells = 500, balanced = TRUE)
#
PG_all <- FindNeighbors(PG_all, dims = 1:12)
PG_all <- FindClusters(PG_all, resolution=c(0.2,0.3,0.4,0.5))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 9833
## Number of edges: 323440
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9244
## Number of communities: 10
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 9833
## Number of edges: 323440
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9059
## Number of communities: 11
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 9833
## Number of edges: 323440
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8921
## Number of communities: 12
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 9833
## Number of edges: 323440
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8794
## Number of communities: 12
## Elapsed time: 1 seconds
PG_all <- RunTSNE(PG_all, dims = 1:12, do.fast=TRUE)
Idents(object = PG_all) <- PG_all$RNA_snn_res.0.2
DimPlot(PG_all, reduction = "tsne", label = TRUE, pt.size = 1, label.size = 6)
PCAPlot(PG_all)
####
####
#### retrieve non-epithelial cells
FeaturePlot(PG_all, features =c('Cd74','Cd72','Cd54') , cols =c('yellow','red'),
reduction ='tsne', pt.size = 0.7,ncol = 3, label = T)
## Warning in FetchData(object = object, vars = c(dims, "ident", features), : The
## following requested variables were not found: Cd54
FeaturePlot(PG_all, features = c('Eng','S1pr1','Emcn'), cols =c('yellow','red'),
reduction = 'tsne',pt.size = 0.7,ncol=3, label = T)
#
#Immu.Cluster <- FindMarkers(PG_all, ident.1='9', only.pos=TRUE, min.pct=0.25)
#Endo.Cluster <- FindMarkers(PG_all, ident.1='4', only.pos=TRUE, min.pct=0.25)
#Immu.Cluster$Symbol <- rownames(Immu.Cluster)
#Endo.Cluster$Symbol <- rownames(Endo.Cluster)
#
v1 <- VlnPlot(PG_all, features = c("Epcam","Cdh1","Krt5", "Krt14"), ncol = 4) #Epithelial markers
v2 <- VlnPlot(PG_all, features = c("Igfbp4","Fn1","Gng11"), ncol = 3) # stromal markers
v3 <- VlnPlot(PG_all, features = c("Eng","S1pr1","Emcn"), ncol = 3) # endothelial markers
v4 <- VlnPlot(PG_all, features = c("Cd74","Cd72"), ncol = 2) #immune markers
ggarrange(v1, v2, v3, v4, ncol = 1)
#
###
### Draw heatmap
###
library(ComplexHeatmap)
library(circlize)
#
geneSets <- c('Cd74','Cd72','Cd54','Eng','S1pr1','Emcn')
cellRanks <- PG_all@meta.data[order(PG_all@meta.data$RNA_snn_res.0.2),]
PartialMatrix <- PG_all@assays$RNA@scale.data[which(rownames(PG_all@assays$RNA@scale.data) %in% geneSets), rownames(cellRanks)]
cellRanks$col <- rainbow(max(as.numeric(cellRanks$RNA_snn_res.0.2))+1)[as.numeric(cellRanks$RNA_snn_res.0.5)+1]
#
ha_column <- HeatmapAnnotation(
df = data.frame(
ClusterID = as.numeric(cellRanks$RNA_snn_res.0.2)
),
col = list(
ClusterID = colorRamp2(unique(as.numeric(cellRanks$RNA_snn_res.0.2)),
rainbow(max(as.numeric(cellRanks$RNA_snn_res.0.2))))
)
)
ht1 = Heatmap(PartialMatrix, name = "Scaled\nUMI", column_title = "non-epithelial cells markers",
top_annotation = ha_column, show_column_names=FALSE, cluster_columns = FALSE,
row_names_gp = gpar(fontsize = 10))
draw(ht1)
# seurat function
### heatmap for non-epithelial cells markers
geneSets <- c('Cd74','Cd72','Cd54','Eng','S1pr1','Emcn')
DoHeatmap(PG_all, features = geneSets)
## Warning in DoHeatmap(PG_all, features = geneSets): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: Cd54,
## Cd72
################################################################################
### Step02 clustering using Seurat #############################################
################################################################################
#
PG_all <- subset(PG_all, idents = c("4","8"), invert = TRUE)
DefaultAssay(PG_all) <- "RNA"
PG_all <- NormalizeData(PG_all, normalization.method='LogNormalize', scale.factor=1e4)
PG_all <- FindVariableFeatures(PG_all, mean.function = ExpMean,
dispersion.function= LogVMR,
mean.cutoff = c(0.0125, 3),
dispersion.cutoff = c(0.5, Inf))
length(PG_all@assays$RNA@var.features)
## [1] 2000
#
PG_all <- ScaleData(PG_all, vars.to.regress=c('nCount_RNA','percent.mt'))
PG_all <- RunPCA(PG_all, features = PG_all@assays$RNA@var.features, verbose = TRUE,
ndims.print=1:5, nfeatures.print = 5)
PG_all <- FindNeighbors(PG_all, dims = 1:12)
PG_all <- FindClusters(PG_all, resolution=c(0.2,0.3,0.4,0.5))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 9316
## Number of edges: 302726
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9165
## Number of communities: 7
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 9316
## Number of edges: 302726
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8963
## Number of communities: 10
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 9316
## Number of edges: 302726
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8814
## Number of communities: 10
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 9316
## Number of edges: 302726
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8660
## Number of communities: 10
## Elapsed time: 0 seconds
PG_all <- RunTSNE(PG_all, dims = 1:12, do.fast=TRUE)
#saveRDS(PG_all, file = "PG_all.RDS")
################################################
PG_all <- readRDS("PG_all.RDS")
Idents(object = PG_all) <- PG_all$RNA_snn_res.0.5
DimPlot(PG_all, label = T, pt.size = 1, label.size = 6)
plot <- VlnPlot(PG_all, features = c("Krt5","Krt14"), combine = FALSE,
fill.by = c("feature","ident"))
plot <- lapply(
X = plot,
FUN = function(p) p + aes(color= PG_all$RNA_snn_res.0.5)
)
CombinePlots(plots = plot, legend = 'none')
## Warning: CombinePlots is being deprecated. Plots should now be combined using
## the patchwork system.
################################################
# Doheatmap
listsA <- c('Cdh1','Epcam','Cldn1','Ocln','Vim','Fn1','S100a4','Zeb1','Zeb2',
'Twist1','Twist2','Snai1','Snai2','Prrx1','Prrx2','Foxc2','Cdh2','Acta2')
# Option 1
DoHeatmap(PG_all, features = listsA)
## Warning in DoHeatmap(PG_all, features = listsA): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay: Foxc2,
## Snai1, Twist1, Zeb1, Ocln, Cdh1
# Option 2: customise
DoHeatmap(PG_all, features = listsA, disp.min = -1,
slot = 'scale.data',
size = 3,
group.colors = rainbow(9)) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red",midpoint = 1)
## Warning in DoHeatmap(PG_all, features = listsA, disp.min = -1, slot =
## "scale.data", : The following features were omitted as they were not found in
## the scale.data slot for the RNA assay: Foxc2, Snai1, Twist1, Zeb1, Ocln, Cdh1
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
#cluster 7 showed both epithelial and stromal markers
################################################################################
# Step 03 DEGs and GSEA analysis
################################################################################
dev.off()
## null device
## 1
PG.markers.7 <- FindMarkers(PG_all, ident.1 = "7", min.pct = 0.20)
PG.markers.7$gene <- rownames(PG.markers.7)
library(biomaRt)
gene_id <- getBM(attributes = c("ensembl_gene_id","external_gene_name","entrezgene_id"),
mart = useDataset("mmusculus_gene_ensembl", useMart("ensembl")))
PG.markers.7 <- merge(PG.markers.7, gene_id[,c(2,3)], by.x = "gene", by.y = "external_gene_name")
PG.markers.7.filt <- PG.markers.7
PG.markers.7.filt <- PG.markers.7.filt[!duplicated(PG.markers.7.filt$entrezgene_id),]
#PG.markers.7.filt <- PG.markers.7[which(PG.markers.7$p_val_adj < 0.05),]
genelist <- PG.markers.7$avg_log2FC
names(genelist) <- PG.markers.7$entrezgene_id
genelist <- sort(genelist, decreasing = TRUE)
library(fgsea)
library(msigdbr)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
m_df <- msigdbr(species = "Mus musculus", category = "C2") %>% dplyr::select(gs_name, entrez_gene)
m_df_fgsea <- split(x = m_df$entrez_gene, f = m_df$gs_name)
Zhang_BasHi <- read.csv("Zhang_BasHi.csv", header = FALSE, fileEncoding="UTF-8-BOM") # add other cell markers from the publication
#Zhang_LumHi <- read.csv("Zhang_LumHi.csv") # add other cell markers from the publication
Zhang_BasHi <- data.frame(Zhang_BasHi)
colnames(Zhang_BasHi)[1] <- "gene"
library(tidyverse)
## Registered S3 method overwritten by 'cli':
## method from
## print.boxx spatstat.geom
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.3 v purrr 0.3.4
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x data.table::between() masks dplyr::between()
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x data.table::first() masks dplyr::first()
## x dplyr::lag() masks stats::lag()
## x data.table::last() masks dplyr::last()
## x tidyr::pack() masks Matrix::pack()
## x biomaRt::select() masks dplyr::select()
## x purrr::transpose() masks data.table::transpose()
## x tidyr::unpack() masks Matrix::unpack()
Zhang_BasHi$gene <- str_to_sentence(Zhang_BasHi$gene)
colnames(Zhang_BasHi)[1] <- "gene"
Zhang_BasHi <- merge(Zhang_BasHi, gene_id[,c(2,3)],by.x = "gene", by.y = "external_gene_name")
Zhang_BasHi <- Zhang_BasHi[,2]
m_df_fgsea[["Zhang_BasHi"]] <- Zhang_BasHi
fgseaRes <- fgsea(pathways = m_df_fgsea,
stats = genelist,
eps = 0.05,
minSize = 15,
maxSize = 800)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.37% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Warning in fgseaMultilevel(...): There were 2 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values. For such pathways pval, padj, NES, log2err are set to NA. You
## can try to increase the value of the argument nPermSimple (for example set it
## nPermSimple = 10000)
## Warning in fgseaMultilevel(...): For some of the pathways the P-values were
## likely overestimated. For such pathways log2err is set to NA.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 0.05. You can set the `eps` argument to zero for better estimation.
selected <- fgseaRes[c(979, 329, 443, 445, 907),]
selected <- selected$pathway
plotGseaTable(m_df_fgsea[selected], genelist, fgseaRes,
gseaParam=0.5)
#### Pathview ####
dev.off()
## null device
## 1
library(pathview)
##
## ##############################################################################
## Pathview is an open source software package distributed under GNU General
## Public License version 3 (GPLv3). Details of GPLv3 is available at
## http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
## formally cite the original Pathview paper (not just mention it) in publications
## or products. For details, do citation("pathview") within R.
##
## The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
## license agreement (details at http://www.kegg.jp/kegg/legal.html).
## ##############################################################################
dme <- pathview(gene.data= genelist, pathway.id="mmu04310", species = "mmu")
## Loading required namespace: org.Mm.eg.db
##
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory C:/Users/lianc/Documents/R_HelenHeZhu_StemCell
## Info: Writing image file mmu04310.pathview.png
knitr::include_graphics("mmu04310.pathview.png")
################################################################################
#### Step 04 Trajectory analysis ########
################################################################################
library(monocle3)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:data.table':
##
## first, second
## The following objects are masked from 'package:Matrix':
##
## expand, unname
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:purrr':
##
## reduce
## The following object is masked from 'package:data.table':
##
## shift
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomeInfoDb
##
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
##
## Assays
## The following object is masked from 'package:Seurat':
##
## Assays
##
## Attaching package: 'monocle3'
## The following objects are masked from 'package:Biobase':
##
## exprs, fData, fData<-, pData, pData<-
library(dplyr)
library(SeuratWrappers)
library(ggplot2)
data <- GetAssayData(PG_all)[VariableFeatures(PG_all),]
pd <- data.frame(PG_all@meta.data)
fData <- data.frame(gene_short_name = rownames(data), row.names = row.names(data))
cds <- new_cell_data_set(expression_data = data,
cell_metadata = pd,
gene_metadata = fData)
cds <- preprocess_cds(cds, num_dim = 12)
#plot_pc_variance_explained(cds)
cds <- reduce_dimension(cds,
preprocess_method = "PCA",
reduction_method= "tSNE",
verbose = T)
## Reduce dimension by tSNE ...
cds <- cluster_cells(cds = cds , reduction_method = "tSNE", verbose = TRUE,
resolution = NULL)
## Running leiden clustering algorithm ...
## Run kNN based graph clustering starts:
## -Input data of 9316 rows and 2 columns
## -k is set to 20
## Finding nearest neighbors...
## DONE. Run time: 0.0400000000000205s
## Compute jaccard coefficient between nearest-neighbor sets ...
## DONE. Run time: 0s
## Build undirected graph from the weighted links ...
## DONE. Run time: 0.0799999999999841s
## Run leiden clustering ...
## leiden_find_partition: partition_type CPMVertexPartition
## leiden_find_partition: seed 1778156675
## leiden_find_partition: resolution_parameter 1e-04
## leiden_find_partition: num_iter 2
## leiden_find_partition: initial_membership 0
## leiden_find_partition: edge_weights 0
## leiden_find_partition: node_sizes 0
## leiden_find_partition: number vertices 9316
## leiden_find_partition: number edges 186320
## Current resolution is 1e-04; Modularity is 0.721961664040301; Quality is 368516.3332; Significance is 137204.426592028; Number of clusters is 7
## Done. Run time: 0.172152042388916s
## Clustering statistics
## resolution_parameter quality modularity significance cluster_count selected
## 1e-04 368516.3 0.7219617 137204.4 7 *
##
## Cell counts by cluster
## cluster cell_count cell_fraction
## 1 3931 0.422
## 2 1803 0.194
## 3 1566 0.168
## 4 1552 0.167
## 5 399 0.043
## 6 37 0.004
## 7 28 0.003
##
## Maximal modularity is 0.721961664040301 for resolution parameter 1e-04
##
## Run kNN based graph clustering DONE.
## -Number of clusters: 7
p1 <- plot_cells(cds, reduction_method = "tSNE")
## No trajectory to plot. Has learn_graph() been called yet?
#compare with seurat data, res = 0.5
cds@clusters$tSNE$clusters <- PG_all$RNA_snn_res.0.5
p2 <- plot_cells(cds, reduction_method = "tSNE")
## No trajectory to plot. Has learn_graph() been called yet?
p1 + p2
# learn graph using UMAP reduction method
cds <- reduce_dimension(cds,
reduction_method= "UMAP",
verbose = T)
## No preprocess_method specified, using preprocess_method = 'PCA'
## Running Uniform Manifold Approximation and Projection
## 12:47:33 UMAP embedding parameters a = 1.577 b = 0.8951
## 12:47:33 Read 9316 rows and found 12 numeric columns
## 12:47:33 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 12:47:34 Writing NN index file to temp file C:\Users\lianc\AppData\Local\Temp\RtmpCQR6tR\file2d744c332422
## 12:47:34 Searching Annoy index using 1 thread, search_k = 1500
## 12:47:35 Annoy recall = 100%
## 12:47:36 Commencing smooth kNN distance calibration using 1 thread
## 12:47:37 Initializing from normalized Laplacian + noise
## 12:47:38 Commencing optimization for 500 epochs, with 192142 positive edges
## 12:47:53 Optimization finished
cds <- cluster_cells(cds = cds , reduction_method = "UMAP", verbose = TRUE)
## Running leiden clustering algorithm ...
## Run kNN based graph clustering starts:
## -Input data of 9316 rows and 2 columns
## -k is set to 20
## Finding nearest neighbors...
## DONE. Run time: 0.0300000000000296s
## Compute jaccard coefficient between nearest-neighbor sets ...
## DONE. Run time: 0.00999999999999091s
## Build undirected graph from the weighted links ...
## DONE. Run time: 0.0300000000000296s
## Run leiden clustering ...
## leiden_find_partition: partition_type CPMVertexPartition
## leiden_find_partition: seed 726503898
## leiden_find_partition: resolution_parameter 1e-04
## leiden_find_partition: num_iter 2
## leiden_find_partition: initial_membership 0
## leiden_find_partition: edge_weights 0
## leiden_find_partition: node_sizes 0
## leiden_find_partition: number vertices 9316
## leiden_find_partition: number edges 186320
## Current resolution is 1e-04; Modularity is 0.765461199534068; Quality is 368677.3964; Significance is 156256.528575102; Number of clusters is 8
## Done. Run time: 0.0795819759368896s
##
## Clustering statistics
## resolution_parameter quality modularity significance cluster_count selected
## 1e-04 368677.4 0.7654612 156256.5 8 *
##
## Cell counts by cluster
## cluster cell_count cell_fraction
## 1 2888 0.310
## 2 2329 0.250
## 3 2044 0.219
## 4 1309 0.141
## 5 400 0.043
## 6 277 0.030
## 7 39 0.004
## 8 30 0.003
##
## Maximal modularity is 0.765461199534068 for resolution parameter 1e-04
##
## Run kNN based graph clustering DONE.
## -Number of clusters: 8
plot_cells(cds, reduction_method = "UMAP", show_trajectory_graph = FALSE, group_label_size = 5)
cds <- learn_graph(cds, use_partition = TRUE)
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
## Warning in min(data_df$weight[data_df$weight > 0]): no non-missing arguments to
## min; returning Inf
# save pre-selected cluster 7 cells as the root cells
pre_selected_cells <- colnames(subset(PG_all, idents = "7"))
cds <- order_cells(cds, reduction_method = "UMAP", root_cells = pre_selected_cells)
# plot trajectories colored by pseudotime
plot_cells(
cds = cds,
color_cells_by = "pseudotime",
show_trajectory_graph = TRUE,
group_label_size = 5,
graph_label_size = 3
)
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## Please use `select()` instead.
## Cells aren't colored in a way that allows them to be grouped.
cds@clusters$UMAP$clusters <- PG_all$RNA_snn_res.0.5
plot_cells(
cds = cds,
color_cells_by = "pseudotime",
show_trajectory_graph = F,
group_label_size = 5,
graph_label_size = 3
)
## Cells aren't colored in a way that allows them to be grouped.
plot_cells(
cds = cds,
color_cells_by = "cluster",
show_trajectory_graph = FALSE,
group_label_size = 5,
graph_label_size = 3
)
plot_cells(
cds = cds,
color_cells_by = "cluster",
group_label_size = 5,
graph_label_size = 4,
label_leaves = F,
label_branch_points = F,
show_trajectory_graph = TRUE,
scale_to_range = T
)
# Add pseudotime value to the seurat object
PG_all <- AddMetaData(object = PG_all,
metadata = cds@principal_graph_aux@listData$UMAP$pseudotime,
col.name = "pseudotime")
FeaturePlot(PG_all, "pseudotime", reduction = "tsne", label = T, label.size = 8) + scale_color_viridis_c()
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
F1 <- FeaturePlot(PG_all, "pseudotime", reduction = "pca", label = T, label.size = 6, repel = T) + scale_color_viridis_c()
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
F2 <- DimPlot(PG_all, reduction = "pca", group.by = "seurat_clusters", label = T, repel = T)
ggarrange(F1, F2, ncol = 2)
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
################################################################################
#### Diffusion Map ####
################################################################################
# package 'destiny' is not available for Bioconductor version '3.13'
################################################################################
#### Slingshot ####
################################################################################
library(slingshot)
## Loading required package: princurve
## Loading required package: TrajectoryUtils
pd <- data.frame(PG_all@meta.data)
fData <- data.frame(gene_short_name = rownames(data), row.names = row.names(data))
counts <- GetAssayData(PG_all)[VariableFeatures(PG_all),]
sce <- SingleCellExperiment(list(counts = counts),
metadata = pd)
# use PCA
reducedDims(sce) <- list(pca = PG_all@reductions$pca@cell.embeddings)
reducedDims(sce)$pca <- reducedDims(sce)$pca[, 1:12]
sce
## class: SingleCellExperiment
## dim: 2000 9316
## metadata(10): orig.ident nCount_RNA ... seurat_clusters pseudotime
## assays(1): counts
## rownames(2000): Acta2 Ccl21a ... 1700097N02Rik Gm10556
## rowData names(0):
## colnames(9316): AAACCTGAGTTACGGG-1 AAACCTGCAAGGTTTC-1 ...
## TTTGTCATCTCTAAGG-1 TTTGTCATCTTACCGC-1
## colData names(0):
## reducedDimNames(1): pca
## mainExpName: NULL
## altExpNames(0):
sce <- slingshot(sce,
reducedDim = "pca",
clusterLabels = sce@metadata$RNA_snn_res.0.5,
start.clus = 7)
sds <- SlingshotDataSet(sce)
sds # has 5 or 6 lineages
## class: SlingshotDataSet
##
## Samples Dimensions
## 9316 12
##
## lineages: 6
## Lineage1: 7 0 5 2 8
## Lineage2: 7 0 5 4
## Lineage3: 7 0 5 6
## Lineage4: 7 0 3
## Lineage5: 7 0 1
## Lineage6: 7 9
##
## curves: 6
## Curve1: Length: 179.94 Samples: 4394.44
## Curve2: Length: 136.35 Samples: 3823.89
## Curve3: Length: 239.65 Samples: 3965.85
## Curve4: Length: 124.63 Samples: 3863.12
## Curve5: Length: 123.43 Samples: 4639.47
## Curve6: Length: 288.31 Samples: 444.83
##########################################################################
############## visualization for pca results ##############################
#######################################################################
# Plot clusters with identified lineages overlayed
rd <- data.frame(reducedDim(sce))
cl <- unlist(sce@metadata$RNA_snn_res.0.5)
names(cl) <- rownames(PG_all@meta.data)
par(xpd=TRUE)
par(mar=c(4.5,5.5,2,7))
plot(rd[,1:2], col = rainbow(10)[cl], asp = 0.5, pch = 20)
lines(SlingshotDataSet(sce), lwd = 2, col = 'black')
legend(x = 30, y = 50, legend= order(unique(cl)),
fill=rainbow(10)[order(unique(cl))])
# Add into Seurat object
PG_all$slingPseudotime_1 <- sce$slingPseudotime_1
PG_all$slingPseudotime_2 <- sce$slingPseudotime_2
PG_all$slingPseudotime_3 <- sce$slingPseudotime_3
PG_all$slingPseudotime_4 <- sce$slingPseudotime_4
PG_all$slingPseudotime_5 <- sce$slingPseudotime_5
PG_all$slingPseudotime_6 <- sce$slingPseudotime_6
FeaturePlot(PG_all, c("slingPseudotime_1","slingPseudotime_2","slingPseudotime_3",
"slingPseudotime_4","slingPseudotime_5","slingPseudotime_6"),
label = T, label.size = 5, cols = rainbow(12))
##########################################################################
### Visualize pseudotime using pca data
##########################################################################
## scatter3d plot, plotly
# refer to https://github.com/Dragonmasterx87/Interactive-3D-Plotting-in-Seurat-3.0.0/blob/master/3D%20UMAP%20Plotting%20v1.3.R
library(rgl)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:biomaRt':
##
## select
## The following object is masked from 'package:ComplexHeatmap':
##
## add_heatmap
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#Embeddings(object = PG_all, reduction = "pca")
#summary(Embeddings(object = PG_all, reduction = "pca"))
plot.data <- FetchData(object = PG_all, vars = c("PC_1", "PC_2", "PC_3", "seurat_clusters", "slingPseudotime_1","slingPseudotime_2","slingPseudotime_3","slingPseudotime_4","slingPseudotime_5","slingPseudotime_6"))
plot.data$label <- paste(rownames(plot.data))
fig <- plot_ly(data = plot.data,
x = ~PC_1, y = ~PC_2, z = ~PC_3,
color = ~seurat_clusters,
colors = rainbow(10),
type = "scatter3d",
mode = "markers",
marker = list(size = 2, width=2), # controls size of points
text=~seurat_clusters, #This is that extra column we made earlier for which we will use for cell ID
hoverinfo="text") #When you visualize your plotly object, hovering your mouse pointer over a point shows cell names
fig <- fig %>% layout(scene = list(xaxis= list(title = "PC_1"),
yaxis= list(title = "PC_2"),
zaxis= list(title = "PC_3")))
fig
## Failed to add lineage
# <- fig %>% add_trace(type = 'scatter3d', data = plot.data,
# x = ~slingPseudotime_1, y = ~slingPseudotime_2, y = ~slingPseudotime_3,
# line = list(color = 'black', width = 1))
#fig
##########################################################################
### Helen's script for the scatter3d plot
###########################################################################
Dims.pca <- sds@reducedDim[,1:3] # pca dimension
clusterLabels.pca <- sds@clusterLabels # cluster labels
connectivity.pca <- sds@adjacency #
clusters.pca <- rownames(connectivity.pca) #
nclus.pca <- nrow(connectivity.pca) #
centers.pca <- t(sapply(clusters.pca,function(x){
x.sub <- Dims.pca[rownames(clusterLabels.pca[which(clusterLabels.pca[, x] == "1"),]),]
return(colMeans(x.sub))
}))
rownames(centers.pca) <- clusters.pca # add row names
#Dims.pca <- Dims.pca[ clusterLabels.pca %in% clusters.pca, ] #
#clusterLabels.pca <- clusterLabels.pca[clusterLabels.pca %in% clusters.pca]
#
sds@lineages
## $Lineage1
## [1] "7" "0" "5" "2" "8"
##
## $Lineage2
## [1] "7" "0" "5" "4"
##
## $Lineage3
## [1] "7" "0" "5" "6"
##
## $Lineage4
## [1] "7" "0" "3"
##
## $Lineage5
## [1] "7" "0" "1"
##
## $Lineage6
## [1] "7" "9"
xs <- c(NULL, centers.pca[, 1 ])
ys <- c(NULL, centers.pca[, 2 ])
zs <- c(NULL, centers.pca[, 3 ])
xs <- c( xs, as.numeric(sapply( sds@curves, function(c){ c$s[, 1 ] }) ))
ys <- c( ys, as.numeric(sapply( sds@curves, function(c){ c$s[, 2 ] }) ))
zs <- c( zs, as.numeric(sapply( sds@curves, function(c){ c$s[, 3 ] }) ))
# straight line 3d plot
rgl::plot3d(x = NULL, y = NULL, z = NULL, aspect = 'iso',
xlim = range(xs), ylim = range(ys), zlim = range(zs),
box=FALSE, axes=FALSE, xlab = '', ylab = '', zlab = '' )
colpal = rainbow(11)
rgl::plot3d(Dims.pca, col= rainbow(10)[cl],
add=TRUE, type='p', size=4, pch=20,alpha=I(1/8),
box=FALSE, axes=FALSE)
rgl::abclines3d(max(Dims.pca[,1]),max(Dims.pca[,2]),max(Dims.pca[,3]),
a = diag(3), col = "black", lwd=2)
rgl::plot3d(centers.pca, size = 10, add = TRUE, pch = 17,
col = colpal[as.numeric(rownames(centers.pca))+1], alpha=1)
for (i in 1:(nclus.pca-1)){
for (j in (i+1):nclus.pca){
if (connectivity.pca[i,j]==1){
rgl::lines3d(x=centers.pca[c(i,j),1], y=centers.pca[c(i,j),2],
z=centers.pca[c(i,j),3],
col=colpal[as.numeric(rownames(connectivity.pca))+1],
lwd=2)
}
}
}
rgl::rgl.snapshot('Slingshot.Cluster9.branchs.straightLine2.png', fmt='png',top=TRUE)
##############################################################################
#### draw multiple lineage inference ###
#############################################################################
## order the cells on each branch
library(dplyr)
for (i in 1:ncol(slingPseudotime(sds))){
linedf <- data.frame(pseudotime=slingPseudotime(sds)[,i],
clus.labels = cl,
samples=rownames(slingPseudotime(sds)))
linedf <- linedf[order(linedf$pseudotime),]
medoids <- sapply(levels(linedf$clus.labels), function(clID){
x.sub <- linedf %>% filter(linedf$clus.labels == clID) %>% dplyr::select(pseudotime)
col <- colpal[as.numeric(as.character(linedf$clus.labels))+1][which.max(linedf$clus.labels == clID)]
return( list(means=mean(x.sub$pseudotime, na.rm=TRUE), sdev= sd(x.sub$pseudotime, na.rm=TRUE),col=col) )
})
means = unlist(medoids$means)
sdev = unlist(medoids$sdev)
col = unlist(medoids$col)
#pdf(paste('2021.08.25.Slingshot.Cluster09.Pseudo.branch_',i,'.pdf',sep=''),
# width=6,height=3)
plot(linedf$pseudotime,rep(0, length(linedf$pseudotime)),cex=3,axes=F,
pch=16, xlab='', ylab='',
col=colpal[as.numeric(as.character(linedf$clus.labels))+1],
ylim=c(-0.1, 0.1), xlim = range(linedf$pseudotime, na.rm=TRUE));
abline(h=0, col="black")
points(x=means,y=rep(0.07, length(means)), col=col, pch=19)
arrows(means-sdev, rep(0.07, length(means)), means+sdev, rep(0.07, length(means)),
length=0.05, angle=90, code=3, col=col)
#dev.off()
}